# -*- coding:utf-8 -*-
"""
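Run this script after splitWord2.

It targets the sinahotnews2 data!
Each day's records must be de-duplicated, but records from different days must
not be pooled together for de-duplication, because the same news item may stay
on the hot list for several consecutive days.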
"""
import pymongo
import pandas as pd
import numpy as np
from datetime import datetime

# Connect to the local MongoDB server and bind the source and destination
# collections inside the `sinafinance` database.
mongoClient = pymongo.MongoClient(host="localhost", port=27017)
db = mongoClient["sinafinance"]
collection = db["sinahotnews2"]     # source collection (queried per day below)
newCollection = db["sinahotnews3"]  # destination collection for the de-duplicated copy

# Build an index on the source collection's `date` field so the per-day
# lookups run faster.
collection.create_index('date')

def dateList(beginDate: str, endDate: str) -> list:
    """Return every calendar day from beginDate to endDate as 'YYYYMMDD' strings.

    Args:
        beginDate: First day of the range (inclusive), e.g. '20130101'.
        endDate: Last day of the range (inclusive), e.g. '20180331'.

    Returns:
        A list of 'YYYYMMDD' strings, one per day in ascending order. The
        list is empty when beginDate falls after endDate.
    """
    days = pd.date_range(start=beginDate, end=endDate)
    # Format the whole index in one vectorized call instead of materializing
    # it into a list and formatting each timestamp separately.
    return days.strftime("%Y%m%d").tolist()

# Copy the source collection into the destination collection one day at a
# time, keeping only the first document seen for each URL within that day.
# The set of seen URLs is reset for every day on purpose: the same URL may
# legitimately appear again on a later day.
allDateList = dateList('20130101', '20180331')
for date in allDateList:
    print(date)  # progress output: the day currently being processed
    urlSet = set()  # URLs already copied for this day only
    # The query compares against int(date); presumably the `date` field is
    # stored as an integer such as 20130101 -- verify against the collection.
    for item in collection.find({'date': int(date)}):
        url = item['url']
        if url in urlSet:
            continue  # duplicate URL within the same day: skip it
        urlSet.add(url)
        # Collection.insert() was deprecated in PyMongo 3.0 and removed in
        # PyMongo 4.0; insert_one() is the supported single-document insert.
        # NOTE: the document is inserted with the `_id` it was read with, so
        # re-running the script against an already populated destination
        # collection raises DuplicateKeyError.
        newCollection.insert_one(item)

